full_set <- rbindlist(lapply(c(1:4), function(i){
  read_xlsx("raw_data/Gun Violence Archive - Brennan Center - Kevin Morris - Incidents - Interval searching - 201605-202305.xlsx",
            sheet = i, skip = 6)
})) |> 
  mutate(gang = grepl("gang", tolower(`Incident Characteristics`)),
         dv = grepl("domestic viol", tolower(`Incident Characteristics`)),
         home = grepl("home", tolower(`Incident Characteristics`)),
         mass_shooting = grepl("mass shooting", tolower(`Incident Characteristics`)),
         vics = `Victims Killed` + `Victims Injured`,
         city = `City Or County`,
         political = grepl("political", tolower(`Incident Characteristics`)),
         hate = grepl("hate", tolower(`Incident Characteristics`)),
         sex = grepl("sex", tolower(`Incident Characteristics`))) |> 
  select(id = `Incident ID`,
         lat = Latitude,
         lon = Longitude,
         date = Date,
         gang, dv, home, state = State,
         deaths = `Victims Killed`,
         mass_shooting,
         vics,
         city, political, hate, sex) |>
  filter(mass_shooting | vics >= 3)


zips <- tigris::zctas()

points <- st_as_sf(SpatialPointsDataFrame(cbind(full_set$lon, full_set$lat), full_set))
points <- st_set_crs(points, st_crs(zips))

points <- st_join(points, zips, join = st_within)
points <- points[,c(1:14)] |> 
  st_drop_geometry()

saveRDS(points, "temp/geocoded_shootings_from_gva.rds")

##########################
# read coded killings keep only well-coded killings
full_set <- readRDS("temp/geocoded_shootings_from_gva.rds") |> 
  select(-state) |> 
  rename(longitude = lon,
         latitude = lat,
         id2 = id) |> 
  ungroup()

## create function to find closest killing on any given day
find_closest <- function(bg_data_f, centroids_f, d){
  d = as.Date(d)
  
  three_four <- lapply(c(3, 4, 5), function(vic_count){
    ## keep killings occurring on that day
    sites <- filter(full_set,
                    date == d, vics >= vic_count) %>%
      mutate(id = row_number())
    
    if(nrow(sites) > 0){ #if there were any killings on that day, do the following:
      ## create tree of those killings
      tree <- createTree(coordinates(dplyr::select(sites, x = longitude, y = latitude)))
      
      ## find closest killing to each point in centroids_f
      inds <- knnLookup(tree , newdat = coordinates(centroids_f), k = 1)
      
      ## combine killing info and BG info
      bg_data_f <- left_join(cbind(bg_data_f, inds),
                             dplyr::select(sites, id, longitude, latitude, date, id2),
                             by = c("inds" = "id"))
      
      ## calculate distance between BG and killing, keep those within 20 miles
      dist <- data.table(dist = pointDistance(dplyr::select(bg_data_f, INTPTLON, INTPTLAT),
                                              dplyr::select(bg_data_f, longitude, latitude), lonlat = T) * 0.000621371,
                         date = as.Date(bg_data_f$date),
                         id = bg_data_f$id2,
                         GEOID = bg_data_f$GEOID) %>% 
        filter(dist <= 20)
    
    }else{
      dist <- data.table(dist = double(),
                         date = as.Date(character()),
                         id = integer(),
                         GEOID = character())
    }
  })
  
  dist <- full_join(three_four[[1]] |> 
                      rename(dist3 = dist,
                             id3 = id),
                    full_join(three_four[[2]] |> 
                                rename(dist4 = dist,
                                       id4 = id),
                              three_four[[3]] |> 
                                rename(dist5 = dist,
                                       id5 = id)))
  return(dist)
}

## this will keep one observation for every block group for every shooting within 20 miles
## over the 2 year-long periods
## loop over every state


assess <- function(s){
  # if(!(file.exists(paste0("temp/bgs_dists_new_", s, ".rds")))){
  library(tigris)
  library(sp)
  library(SearchTrees)
  library(raster)
  library(data.table)
  library(tidyverse)
  ## pull BG shapefiles using tigris package
  bgs <- block_groups(state = s, class = "sp", year = 2019)

  centroids <- SpatialPoints(
    data.table(x = as.numeric(bgs@data$INTPTLON), y = as.numeric(bgs@data$INTPTLAT))
  )


  bg_data <- bgs@data %>%
    mutate_at(vars(INTPTLON, INTPTLAT), as.numeric)

  #########################################
  ## loop over every day between Jan 1, 2020, and Election day
  tot <- rbindlist(lapply(seq(as.Date("2020-05-03"), as.Date("2021-05-02"), by="days"), function(d){
    l <- find_closest(bg_data, centroids, d)
  }))

  tot2 <- rbindlist(lapply(seq(as.Date("2016-05-08"), as.Date("2017-05-07"), by="days"), function(d){
    l <- find_closest(bg_data, centroids, d)
  }))

  tot <- bind_rows(tot, tot2,
                   )

  saveRDS(tot, paste0("temp/bgs_dists_new__from_gva", s, ".rds"))

  
  pings  <- SpatialPoints(full_set[,c('longitude','latitude')], proj4string = bgs@proj4string)
  full_set$GEOID <- over(pings, bgs)$GEOID
  
  in_state <- full_set |> 
    filter(!is.na(GEOID)) |> 
    select(id = id2, date, GEOID)
  
  saveRDS(in_state, paste0("temp/shooting_home_bg_", s, ".rds"))
  
}

cl <- makeCluster(12)  
registerDoParallel(cl)

clusterExport(cl, list("find_closest", "assess", "full_set"))

sts <- unique(filter(fips_codes, state_code <= 56)$state_code)

parLapply(cl, sts, fun = assess)

cleanup()

files <- list.files(path = "temp/", pattern = "^bgs_dists_new__from*", full.names = T)

all_bgs <- rbindlist(lapply(files, readRDS))

files <- list.files(path = "temp/", pattern = "^shooting_home_*", full.names = T)

homes1 <- rbindlist(lapply(files, readRDS)) |> 
  filter((date >= "2016-05-08" & date <= "2017-05-07") |
           (date >= "2020-05-03" & date <= "2021-05-02")) |> 
  select(date, GEOID, id) |> 
  mutate(d2 = 0)

full_set <- readRDS("temp/geocoded_shootings_from_gva.rds") |> 
  select(id, vics)

homes <- left_join(homes1, full_set) |> 
  filter(vics >= 3) |> 
  select(date, GEOID, d2) |> 
  distinct()

all_bgs <- full_join(all_bgs, homes) |> 
  mutate(dist = ifelse(is.na(d2), dist3, d2)) |> 
  select(-d2)

homes <- left_join(homes1, full_set) |> 
  filter(vics >= 4) |> 
  select(date, GEOID, d2) |> 
  distinct()

all_bgs <- full_join(all_bgs, homes) |> 
  mutate(dist = ifelse(is.na(d2), dist4, d2)) |> 
  select(-d2)

homes <- left_join(homes1, full_set) |> 
  filter(vics >= 5) |> 
  select(date, GEOID, d2) |> 
  distinct()

all_bgs <- full_join(all_bgs, homes) |> 
  mutate(dist = ifelse(is.na(d2), dist5, d2)) |> 
  select(-d2)

saveRDS(all_bgs, "temp/dists_long_new_from_gva.rds")



